##------------------------------------------------------------------------------##
##  Replication Materials for: "The World Bank as an Enforcer of Human Rights?" ##
##  Latent variable estimates                                                   ##
##  Complaint Fundamentals                                                      ##
##  Kelebogile Zvobgo and Benjamin A.T. Graham                                  ##
##  Adapted from code written by Michael Kenwick and Christopher Fariss         ##
##  Prepared for public posting January 13, 2020                                ##
##------------------------------------------------------------------------------##



# install packages
install.packages("foreign")
install.packages("StanHeaders")
install.packages("rstan", repos = "https://cloud.r-project.org/", dependencies = TRUE)
pkgbuild::has_build_tools(debug = TRUE)
install.packages("ggplot2")
install.packages("gridExtra")
install.packages("devtools")
install.packages("edstan")
install.packages("psych")



# load packages
library(foreign)
library("rstan") # observe startup messages
options(mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)
library(ggplot2)
library(gridExtra)  
library(devtools) 
library(edstan)
library(psych)



# load data
setwd("/Volumes/GoogleDrive/My Drive/IFIs_and_HR/Data/Replication materials")
mydata <- read.dta("2_WBHR_Data_zg2020.dta")
dim(mydata)



# variable selection
data_complaint <- subset(mydata, complaint==1)
data <- subset(data_complaint, select=c(wb_hr_id,
                              gwno, 
                              year, 
                              raw_data,
                              personal_testimony,
                              petition, 
                              typed, 
                              english, 
                              bank_policy,
                              bank_policy_viol))



### Histogram of the raw score distribution and 
### Bar graph of the proportion of correct responses by item

# The data set is available from the edstan package.
preview_rows <- seq(from = 1, to = nrow(data), length.out = 10)
data[preview_rows, ]

# Record existing plot presets and prepare to make side-by-side pots
par_bkp <- par(no.readonly = TRUE)
par(mfrow = c(1, 2))

# Left plot
complaint_scores <- apply(data[, 4:10], 1, sum, na.rm = TRUE)
complaint_counts <- table(complaint_scores)
barplot(complaint_counts, main = "Raw score distribution", xlab = "Raw score", 
        ylab = "Number of complaints")

# Right plot
item_scores <- apply(data[, 4:10], 2, mean, na.rm = TRUE)
barplot(item_scores, main = "Proportion correct by item", ylab = "Proportion correct", 
        ylim = c(0, 1), xaxt = "n")
# x-axis with angled labels
text(x = 0.85 + (1:length(item_scores) - 1) * 1.2, y = -0.05, labels = names(item_scores), 
     xpd = TRUE, srt = 30, pos = 2)

### Plot description:
# The raw score distribution shows that 1 complaint has all "incorrect" response patterns
# and 9 complaints have all "correct" response patterns. Complaints with the highest and 
# lowest raw scores contribute less to our estimation. Estimating the ability parameter for
# these complaints is more difficult than for all others. We then turn to the proportion of
# "correct" responses by item to describe these items' 'difficulty'. Typed complaints are easiest,
# whereas petitions and personal testimonies are the most difficult. For the most part, items
# have a mix of correct and incorrect responses. We can, therefore, estimate alphas and betas
# for all items.

    # ***Due to space constraints, this figure is not included in the paper*** #
    #    We include it here for readers' perusal and re-use of our code.       #



# Return to previous plot presets
par(par_bkp)



### Difficulty and Discrimination Parameters

# Recreate the item name index and use it to check the structure
# of the unscored reading items

# The strict.width argument is optional, making sure the results fit
# in the console window

ritems <- c("raw_data", "personal_testimony","petition", "typed", 
            "english", "bank_policy", "bank_policy_viol")
str(data[, ritems], strict.width = "cut")
apply(data[, ritems], 2, table, na = NULL)


## ITEM DIFFICULTY: How difficult is each item in our sample/what is the mean level of 
## performance?

round(colMeans(data[, ritems], na.rm = T), 2)
    # Two items stands out as having a very low p-values, personal_testimony and petition. 
    # Only 21% and 22% of complaints include testimony of project-affected
    # communities and petitions, respectively.
    # Thus, these are more difficult items than, say, presenting a typed complaint or
    # alleging a violation of WB policy.


## ITEM DISCRIMINATION: How well does the item discriminate between high and low complaint 
## fundamentals scorers?

# Get total complaint scores and check descriptives
data$rtotal <- apply(data[, 4:10], 1, sum, na.rm = TRUE)
describe(data$rtotal)

data_comp <- data[, c(ritems, "rtotal")]
round(cor(data_comp), 2)

# Scatter plots for visualizing item discrimination
p1 <- ggplot(data_comp, aes(rtotal, factor(raw_data))) +
  geom_point(position = position_jitter(w = 0.2, h = 0.2))
p2 <-ggplot(data_comp, aes(rtotal, factor(personal_testimony))) +
  geom_point(position = position_jitter(w = 0.2, h = 0.2))
p3 <- ggplot(data_comp, aes(rtotal, factor(petition))) +
  geom_point(position = position_jitter(w = 0.2, h = 0.2))
p4 <- ggplot(data_comp, aes(rtotal, factor(typed))) +
  geom_point(position = position_jitter(w = 0.2, h = 0.2))
p5 <- ggplot(data_comp, aes(rtotal, factor(english))) +
  geom_point(position = position_jitter(w = 0.2, h = 0.2))
p6 <- ggplot(data_comp, aes(rtotal, factor(bank_policy))) +
  geom_point(position = position_jitter(w = 0.2, h = 0.2))
p7 <- ggplot(data_comp, aes(rtotal, factor(bank_policy_viol))) +
  geom_point(position = position_jitter(w = 0.2, h = 0.2))


# Multiplot
source("http://peterhaschke.com/Code/multiplot.R")
multiplot(p1, p2, p3, p4, p5, p6, p7, cols=3)
    # bank_policy_viol has the highest discrimination. Thus, it gives us the most information
    # about complaint fundamentals, our construct of interest.
   
    # ***Due to space constraints, this figure is not included in the paper*** #
    #    We include it here for readers' perusal and re-use of our code.       #



##### LATENT VARIABLE #####

# -------------------------define STAN model------------------------- #
model <- "
data {
int<lower=0> n; // number of subject observations
int<lower=0> k; // number of items
int<lower=0> n_k; // number of non-missing item-subject observations

int<lower=0> item[n_k]; // vector of length n_k that distingishes the value of the k item/model parameters
int<lower=0> id[n_k]; // vector of length n_k that distingishes the value of the n latent variable parameters

int<lower=0> y[n_k];
}
transformed data{

}
parameters {
vector[k] alpha; //discrimination for item i
real beta[k]; // difficulty for item i
vector[n] x;
//real<lower=0> sigma[k];
}
transformed parameters {
//vector[n_k] alpha_star; // column vector of alpha parameters
//vector[n_k] beta_star; // column vector of alpha parameters
//real<lower=0> beta_star[n_k]; // column vector of beta parameters
//real<lower=0> sigma_star[n_k]; // column vector of sigma parameters
//vector[n_k] x_star; // column vector of latent variable parameters
//vector[n_k] xb; // column vector of linear combination of parameter column vectors

//for(ii in 1:n_k){
// loop through parameters vectors to generate column vectors
//alpha_star[ii] = alpha[item[ii]];
//beta_star[ii] = beta[item[ii]];
// sigma_star[ii] = sigma[item[ii]];
//x_star[ii] = x[id[ii]];
//}

}
model {

alpha ~ normal(0,10);
for(j in 1:1){
beta[j] ~ gamma(1,1);
}
for(j in 2:3){
beta[j] ~ normal(0,1);
}
for(j in 4:4){
beta[j] ~ gamma(1,1);
}
for(j in 5:5){
beta[j] ~ normal(0,1);
}
for(j in 6:7){
beta[j] ~ gamma(1,1);
}

x ~ normal(0,1);

for(ii in 1:n_k){

y[ii] ~ bernoulli_logit(alpha[item[ii]] + beta[item[ii]] * x[id[ii]]);

}

}
"
# -------------------------------------------------- #



# make a matrix of the item response
y <- as.matrix(subset(data, select=c(-wb_hr_id, -gwno, -year, -rtotal)))

# dimension indicators
k <- ncol(y)
n <- nrow(y)
# make a column vector of the item response with missing values excluded
y_missing <- which(!is.na(y))
#y <- as.vector(as.matrix(y))
y <- y[y_missing]
length(y)
# scalar for the total number of subjects by observed items
n_k <- length(y)
n_k
# make a column vector of the item numbers with missing values excluded
item <- matrix(c(1:k), ncol=k, nrow=n, byrow=T)
item <- c(item)
item <- item[y_missing]
# make a column vector of the subject ids with missing values excluded
id <- matrix(1:n,ncol=k,nrow=n, byrow=F)
id <- c(id)
id <- id[y_missing]
# create data list to pass to STAN
data.list <- list(y=y, k=k, n=n, n_k=n_k, item=item, id=id)
# fit stan mode
fit <- stan(model_code = model, data = data.list, iter = 500, chains = 3, control=list(adapt_delta=0.9), thin=5)
# print fit object
fit
# extract draws from stan model object (aka, fit from line above)
output <- extract(fit, permuted = TRUE)
# print names
names(output)



# this prints the posterior mean for the latent variable
hist(apply(output$x,2,mean), main="Distribution of the Latent Variable", 
     xlab="Complaint Fundamentals", ylim=c(0,50))
   
     # ***Due to space constraints, this figure is not included in the paper*** #
     #    We include it here for readers' perusal and re-use of our code.       #

#Theta_s is the posterior mean for each observation
data$theta_fundamentals <-  apply(output$x, 2, mean) # taking means of simulated thetas; mean of 3 chains
data$theta_fundamentals_sd <-  apply(output$x, 2, sd) # taking sds of simulated thetas; mean of 3 chains
data$theta_up_s <- apply(output$x, 2, quantile, 0.975) # what is the number that marks the 95 CI onthe upper bound
data$theta_lw_s <- apply(output$x, 2, quantile, 0.025) # what is the number that marks the 95 CI on the upper bound

print(names(output))

fundamentals_theta <- as.matrix(subset(data,select = c(wb_hr_id,theta_fundamentals, theta_fundamentals_sd)))
fundamentals_theta <- data.frame(fundamentals_theta)

write.dta(data = fundamentals_theta, file = "fundamentals_theta.dta")

# -------------------------------------------------------------------------------------#



# print the latent variable
cbind(apply(output$x,2,mean), apply(output$x,2,sd))



# -------------------------------------------------------------------------------------#

# Figure B1: Complaint Fundamentals: Additive Scale
par(mar=c(4,4,1,1), font=2, font.lab=2, cex=1.3)
plot(data$rtotal, apply(output$x,2,mean), xlim=c(0,6), ylim=c(-3.75,3.75), ylab="Posterior Estimates", xlab="Additive Scale")
abline(reg=lm(apply(output$x,2,mean) ~ data$rtotal), col=2, lwd=2)

lb <- apply(output$x,2,quantile,.025)
ub <- apply(output$x,2,quantile,.975)

for(i in 1:n){
  lines(c(data$rtotal[i], data$rtotal[i]), c(lb[i], ub[i]), col="orange4", lwd=2)
}

points(data$rtotal, apply(output$x,2,mean), col="orange4", bg="orange", cex=1.2, pch=21)

# While an additive scale is not inconsistent with the latent variable we estimate, it is clear that there is
# much heterogeneity across raw scores in terms of our construct of interest, complaint fundamentals
# Additive scales assume equal weights across indicators (SCHNAKENBERG & FARISS 2013), whereas, as we demonstrate 
# below, some indicators 'matter' more than others.

# -------------------------------------------------------------------------------------#



# make an index, print it, re-order it, print it again
INDEX <- 1:n
INDEX
INDEX <- INDEX[order(apply(output$x,2,mean))]
INDEX



# -------------------------------------------------------------------------------------#

# Figure B2: Complaint Fundamentals: Posterior Estimates

# take the means of the posterior distributions, based on the rank order of the means
POSTERIORS <- apply(output$x,2,mean)[INDEX]

par(mar=c(4,4,1,1), font=2, font.lab=2, cex=1.3)
plot(POSTERIORS, 1:n, xlim=c(-3.75,3.75), ylab="Complaints", xlab="Posterior Estimates", yaxt="n")

# upper and lower bounds of the posterior distributions
lb <- apply(output$x,2,quantile,.025)[INDEX]
ub <- apply(output$x,2,quantile,.975)[INDEX]

# loop through each estimate and plot a line for the 95% credible interval
for(i in 1:n){
  lines(c(lb[i], ub[i]), c(i,i), col="orangered4", lwd=2)
}

# place the points on top of the credible intervals
points(POSTERIORS, 1:n, col="orangered4", bg="orangered", cex=1.2, pch=21)

# -------------------------------------------------------------------------------------#



# make an index, print it, re-order it, print it again
INDEX <- 1:k
INDEX
INDEX  <- INDEX[order(apply(output$alpha,2,mean))]
INDEX



# -------------------------------------------------------------------------------------#

# Figure B3: Complaint Fundamentals: Difficulty Parameters

# make the labels for the questions
ALPHA.LABELS <- names(data[-(1:3)])

# ALPHA.LABELS <- c("English", "Personal Testimony", "Petition", "Typed", "Raw Data", "Bank Policy Citation", "Bank Policy Violation")
ALPHA <- apply(output$alpha,2,mean)[INDEX]
LB.ALPHA <- apply(output$alpha,2,quantile,.025)[INDEX]
UB.ALPHA <- apply(output$alpha,2,quantile,.975)[INDEX]

par(mar=c(5, 0.5, 3, 10)) #bottom, left, top, right

barplot(ALPHA, horiz=T, space=0, xlim=c(-6.25,6.25), xlab="Difficulty Parameters")

for(i in order(ALPHA)){
  lines(c(LB.ALPHA[i], UB.ALPHA[i]), c(i-.5,i-.5), col=grey(.15), lwd=2)
}

axis(side=4, at=seq(.5, 7, 1), labels=ALPHA.LABELS[INDEX], las=2)

# -------------------------------------------------------------------------------------#



# make an index, print it, re-order it, print it again
INDEX <- 1:k
INDEX
INDEX  <- INDEX[order(apply(output$beta,2,mean))]
INDEX



# -------------------------------------------------------------------------------------#

#  Figure B4: Complaint Fundamentals: Discrimination Parameters

BETA <- apply(output$beta,2,mean)[INDEX]
LB.BETA <- apply(output$beta,2,quantile,.025)[INDEX]
UB.BETA <- apply(output$beta,2,quantile,.975)[INDEX]

par(mar=c(5, 1, 3, 9.5)) #bottom, left, top, right

barplot(BETA, horiz=T, space=0, xlim=c(-4.25,4.25), xlab="Discrimination Parameters")

for(i in order(BETA)){
  lines(c(LB.BETA[i], UB.BETA[i]), c(i-.5,i-.5), col=grey(.15), lwd=2)
}

axis(side=4, at=seq(.5, 7, 1), labels=ALPHA.LABELS[INDEX], las=2)

# -------------------------------------------------------------------------------------#



# -------------------------------------------------------------------------------------#

# plot the difference between the least discriminator question and the other 9 questions
BETA.DIFF <- output$beta - output$beta[,2]

# make an index, print it, re-order it, print it again
INDEX <- 1:k
INDEX
INDEX  <- INDEX[order(apply(BETA.DIFF,2,mean))]
INDEX

BETA <- apply(BETA.DIFF,2,mean)[INDEX]
LB.BETA <- apply(BETA.DIFF,2,quantile,.025)[INDEX]
UB.BETA <- apply(BETA.DIFF,2,quantile,.975)[INDEX]

par(mar=c(5, 1, 3, 9.5)) #bottom, left, top, right

barplot(BETA, horiz=T, space=0, xlim=c(-2.25,4.25), main="Complaint Fundamentals", xlab="Difference in Discrimination Parameter")

for(i in order(BETA)){
  lines(c(LB.BETA[i], UB.BETA[i]), c(i-.5,i-.5), col=grey(.15), lwd=2)
}

axis(side=4, at=seq(.5, 7, 1), labels=ALPHA.LABELS[INDEX], las=2)

TEST <- NA
for(i in 1:k){
  TEST[i] <- mean(BETA.DIFF[,i]>0)
}

axis(side=2, at=seq(.5, 7, 1), labels=round(TEST, digits=3)[INDEX], las=2, tick=F, line=-3)
mtext("Probability of Difference in Discrimination",side=2, line=1, cex=1.5)

# ***Due to space constraints, this figure is not included in the paper*** #
#    We include it here for readers' perusal and re-use of our code.       #

# -------------------------------------------------------------------------------------#



# make an index, print it, re-order it, print it again
INDEX <- 1:k
INDEX
INDEX  <- INDEX[order(apply(output$alpha,2,mean))]
INDEX



# -------------------------------------------------------------------------------------#

# Figure B5: Complaint Fundamentals: Discrimination and Difficulty

par(mfrow=c(3,3), mar=c(2.5,1,0.5,1))

plot(0,0, xlim=c(0,1), ylim=c(0,1), type="n", yaxt="n", xaxt="n", yaxt="n", xlab="", ylab="", bty="n")
mtext(expression(Pr(correct[j])), cex=1.5, line=-6)


for(j in INDEX){
  x <- seq(from=-3.0, to=3.5, by=.01)
  
  values <- 1 / (1+exp(-(output$alpha[,j] + output$beta[,j] %*% t(x))))
  
  plot(x, values[1,], ylim=c(0,1.275), type="n", lwd=2.0, col="orange", xaxt="n", yaxt="n", xlab="", ylab="", xlim=c(-4.00,3.25))
  mtext(ALPHA.LABELS[j], side=3, line=-3, cex=1, at=-1.1)
  axis(side=1, at=c(-3,-2,-1,0,1,2,3))
  axis(side=2, at=c(0.01,.2,.4,.6,.8,1,1.2), labels=c("0.0","0.2","0.4","0.6","0.8","1.0","Pr()"), tick=F, pos=-2.9, las=2, cex.axis=1.2)
  #axis(side=2, at=c(0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1,1.2), labels=c("0.0","0.1","0.2","0.3","0.4","0.5","0.6","0.7","0.8","0.9","1.0"," Pr()"), tick=F, pos=-3.25, las=2, cex.axis=.8)
  lines(x, apply(values,2,quantile, .05), ylim=c(0,1), type="l", lwd=2.0, col="darkorange")
  lines(x, apply(values,2,quantile, .95), ylim=c(0,1), type="l", lwd=2.0, col="darkorange")
  lines(x, apply(values,2,mean), ylim=c(0,1), type="l", lwd=2.0, col="darkred")
  
}